1.0 Getting Started

Note that to make it easier to manage the results for this capstone project, the different tasks (ie. exploratory data analysis and supervised learning) in capstone project will be split into different HTML result files.

This HTML file will focus on Exploratory Data Analysis.

1.1 Setup R Environment for Later Analysis

Call the relevant packages to setup the environment.

packages <- c('tidyverse', 'lubridate', 'funModeling', 'corrplot', 'tibble', 'skimr', 'plotly', 'reshape2', 'tidytext', 'tm', 'ggstatsplot', 'ggmosaic','readxl', 'gridExtra', 'stopwords', 'ggwordcloud', 'stringr', 'infer', 'tictoc', 'beepr', 'PMCMRplus')

for (p in packages){
  if(!require (p, character.only = T)){
    install.packages(p)
  }
  library(p, character.only = T)
}

Set the random seed for reproducibility

set.seed(123)

2.0 Exploratory Data Analysis (EDA)

In this section, I will perform various EDA to ensure the dataset is good enough for us to build machine learning models in a later steps.

2.1 Import Data

data <- read_csv("data/actuarial_loss_train.csv") %>%
  rename(InitialIncurredClaimCost = InitialIncurredCalimsCost)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   ClaimNumber = col_character(),
##   DateTimeOfAccident = col_datetime(format = ""),
##   DateReported = col_datetime(format = ""),
##   Age = col_double(),
##   Gender = col_character(),
##   MaritalStatus = col_character(),
##   DependentChildren = col_double(),
##   DependentsOther = col_double(),
##   WeeklyWages = col_double(),
##   PartTimeFullTime = col_character(),
##   HoursWorkedPerWeek = col_double(),
##   DaysWorkedPerWeek = col_double(),
##   ClaimDescription = col_character(),
##   InitialIncurredCalimsCost = col_double(),
##   UltimateIncurredClaimCost = col_double()
## )
data_addCol <- data %>%
  mutate(day_diff = difftime(DateReported, as.Date(DateTimeOfAccident), units = "days"),
         day_diff = as.numeric(day_diff),
         init_ult_diff = UltimateIncurredClaimCost - InitialIncurredClaimCost,
         acc_yr = factor(year(DateTimeOfAccident), order = TRUE),
         acc_qtr = factor(quarter(DateTimeOfAccident), order = TRUE), 
         acc_mth = factor(month(DateTimeOfAccident), order = TRUE),
         report_yr = factor(year(DateReported), order = TRUE),
         report_qtr = factor(quarter(DateReported), order = TRUE),
         report_mth = factor(month(DateReported), order = TRUE),
         acc_hr = factor(hour(DateTimeOfAccident), order = TRUE),
         num_week_paid_init = InitialIncurredClaimCost/WeeklyWages,
         num_week_paid_ult = UltimateIncurredClaimCost/WeeklyWages)

2.1 Data Quality Check

This is the base R function to check the data quality. The output from this function is often not sufficient.

summary(data_addCol)
##  ClaimNumber        DateTimeOfAccident             DateReported                
##  Length:54000       Min.   :1988-01-01 09:00:00   Min.   :1988-01-08 00:00:00  
##  Class :character   1st Qu.:1992-06-30 07:00:00   1st Qu.:1992-08-04 00:00:00  
##  Mode  :character   Median :1997-01-07 10:00:00   Median :1997-02-16 00:00:00  
##                     Mean   :1997-01-03 05:08:31   Mean   :1997-02-11 01:00:17  
##                     3rd Qu.:2001-07-09 11:00:00   3rd Qu.:2001-08-25 00:00:00  
##                     Max.   :2005-12-31 10:00:00   Max.   :2006-09-23 00:00:00  
##                                                                                
##       Age           Gender          MaritalStatus      DependentChildren
##  Min.   :13.00   Length:54000       Length:54000       Min.   :0.0000   
##  1st Qu.:23.00   Class :character   Class :character   1st Qu.:0.0000   
##  Median :32.00   Mode  :character   Mode  :character   Median :0.0000   
##  Mean   :33.84                                         Mean   :0.1192   
##  3rd Qu.:43.00                                         3rd Qu.:0.0000   
##  Max.   :81.00                                         Max.   :9.0000   
##                                                                         
##  DependentsOther     WeeklyWages     PartTimeFullTime   HoursWorkedPerWeek
##  Min.   :0.000000   Min.   :   1.0   Length:54000       Min.   :  0.00    
##  1st Qu.:0.000000   1st Qu.: 200.0   Class :character   1st Qu.: 38.00    
##  Median :0.000000   Median : 392.2   Mode  :character   Median : 38.00    
##  Mean   :0.009944   Mean   : 416.4                      Mean   : 37.74    
##  3rd Qu.:0.000000   3rd Qu.: 500.0                      3rd Qu.: 40.00    
##  Max.   :5.000000   Max.   :7497.0                      Max.   :640.00    
##                                                                           
##  DaysWorkedPerWeek ClaimDescription   InitialIncurredClaimCost
##  Min.   :1.000     Length:54000       Min.   :      1         
##  1st Qu.:5.000     Class :character   1st Qu.:    700         
##  Median :5.000     Mode  :character   Median :   2000         
##  Mean   :4.906                        Mean   :   7841         
##  3rd Qu.:5.000                        3rd Qu.:   9500         
##  Max.   :7.000                        Max.   :2000000         
##                                                               
##  UltimateIncurredClaimCost    day_diff       init_ult_diff          acc_yr     
##  Min.   :    122           Min.   :   0.00   Min.   :-1912737   1996   : 3034  
##  1st Qu.:    926           1st Qu.:  14.00   1st Qu.:    -536   2004   : 3032  
##  Median :   3371           Median :  22.00   Median :      82   1997   : 3031  
##  Mean   :  11003           Mean   :  39.31   Mean   :    3162   2001   : 3028  
##  3rd Qu.:   8197           3rd Qu.:  41.00   3rd Qu.:    1417   1988   : 3023  
##  Max.   :4027136           Max.   :1095.00   Max.   : 3984136   1992   : 3022  
##                                                                 (Other):35830  
##  acc_qtr      acc_mth        report_yr     report_qtr   report_mth   
##  1:13357   5      : 4956   2002   : 3078   1:12761    3      : 4905  
##  2:13545   3      : 4748   2004   : 3054   2:13906    5      : 4822  
##  3:13690   7      : 4693   2001   : 3052   3:13596    6      : 4709  
##  4:13408   11     : 4677   1996   : 3029   4:13737    10     : 4654  
##            8      : 4671   1998   : 3029              9      : 4626  
##            10     : 4655   2005   : 3023              11     : 4623  
##            (Other):25600   (Other):35735              (Other):25661  
##      acc_hr      num_week_paid_init num_week_paid_ult 
##  12     : 6612   Min.   :     0.0   Min.   :     0.3  
##  11     : 6186   1st Qu.:     2.1   1st Qu.:     2.7  
##  10     : 6118   Median :     5.8   Median :     9.0  
##  9      : 5265   Mean   :    60.2   Mean   :    61.4  
##  14     : 4809   3rd Qu.:    21.1   3rd Qu.:    23.1  
##  8      : 4558   Max.   :690000.0   Max.   :525713.3  
##  (Other):20452

Three approaches (ie. status, profiling_num & skim) are used to check the quality of the data. This is because all approaches contain different measurements in the result file, i.e. complement each other.

status(data_addCol)
##                     variable q_zeros      p_zeros q_na        p_na q_inf p_inf
## 1                ClaimNumber       0 0.000000e+00    0 0.000000000     0     0
## 2         DateTimeOfAccident       0 0.000000e+00    0 0.000000000     0     0
## 3               DateReported       0 0.000000e+00    0 0.000000000     0     0
## 4                        Age       0 0.000000e+00    0 0.000000000     0     0
## 5                     Gender       0 0.000000e+00    0 0.000000000     0     0
## 6              MaritalStatus       0 0.000000e+00   29 0.000537037     0     0
## 7          DependentChildren   50639 9.377593e-01    0 0.000000000     0     0
## 8            DependentsOther   53506 9.908519e-01    0 0.000000000     0     0
## 9                WeeklyWages       0 0.000000e+00    0 0.000000000     0     0
## 10          PartTimeFullTime       0 0.000000e+00    0 0.000000000     0     0
## 11        HoursWorkedPerWeek      29 5.370370e-04    0 0.000000000     0     0
## 12         DaysWorkedPerWeek       0 0.000000e+00    0 0.000000000     0     0
## 13          ClaimDescription       0 0.000000e+00    0 0.000000000     0     0
## 14  InitialIncurredClaimCost       0 0.000000e+00    0 0.000000000     0     0
## 15 UltimateIncurredClaimCost       0 0.000000e+00    0 0.000000000     0     0
## 16                  day_diff       5 9.259259e-05    0 0.000000000     0     0
## 17             init_ult_diff       0 0.000000e+00    0 0.000000000     0     0
## 18                    acc_yr       0 0.000000e+00    0 0.000000000     0     0
## 19                   acc_qtr       0 0.000000e+00    0 0.000000000     0     0
## 20                   acc_mth       0 0.000000e+00    0 0.000000000     0     0
## 21                 report_yr       0 0.000000e+00    0 0.000000000     0     0
## 22                report_qtr       0 0.000000e+00    0 0.000000000     0     0
## 23                report_mth       0 0.000000e+00    0 0.000000000     0     0
## 24                    acc_hr     688 1.274074e-02    0 0.000000000     0     0
## 25        num_week_paid_init       0 0.000000e+00    0 0.000000000     0     0
## 26         num_week_paid_ult       0 0.000000e+00    0 0.000000000     0     0
##              type unique
## 1       character  54000
## 2  POSIXct/POSIXt  36673
## 3  POSIXct/POSIXt   6653
## 4         numeric     68
## 5       character      3
## 6       character      3
## 7         numeric      9
## 8         numeric      5
## 9         numeric  13211
## 10      character      2
## 11        numeric    424
## 12        numeric      7
## 13      character  28114
## 14        numeric   1989
## 15        numeric  53999
## 16        numeric    574
## 17        numeric  53999
## 18 ordered-factor     18
## 19 ordered-factor      4
## 20 ordered-factor     12
## 21 ordered-factor     19
## 22 ordered-factor      4
## 23 ordered-factor     12
## 24 ordered-factor     24
## 25        numeric  28681
## 26        numeric  53999
profiling_num(data_addCol)
##                     variable         mean      std_dev variation_coef
## 1                        Age 3.384237e+01 1.212216e+01      0.3581949
## 2          DependentChildren 1.191852e-01 5.177800e-01      4.3443321
## 3            DependentsOther 9.944444e-03 1.093475e-01     10.9958422
## 4                WeeklyWages 4.163648e+02 2.486387e+02      0.5971654
## 5         HoursWorkedPerWeek 3.773508e+01 1.256870e+01      0.3330774
## 6          DaysWorkedPerWeek 4.905759e+00 5.521291e-01      0.1125471
## 7   InitialIncurredClaimCost 7.841146e+03 2.058408e+04      2.6251360
## 8  UltimateIncurredClaimCost 1.100337e+04 3.339099e+04      3.0346152
## 9                   day_diff 3.931002e+01 6.110810e+01      1.5545171
## 10             init_ult_diff 3.162223e+03 3.155454e+04      9.9785950
## 11        num_week_paid_init 6.017575e+01 3.145609e+03     52.2736954
## 12         num_week_paid_ult 6.141505e+01 2.382575e+03     38.7946389
##             p_01          p_05        p_25        p_50       p_75        p_95
## 1   1.600000e+01    18.0000000   23.000000   32.000000   43.00000    56.00000
## 2   0.000000e+00     0.0000000    0.000000    0.000000    0.00000     1.00000
## 3   0.000000e+00     0.0000000    0.000000    0.000000    0.00000     0.00000
## 4   4.200000e+01   200.0000000  200.000000  392.200000  500.00000   817.00550
## 5   1.000000e+01    22.3815000   38.000000   38.000000   40.00000    40.00000
## 6   2.000000e+00     4.0000000    5.000000    5.000000    5.00000     5.00000
## 7   1.600000e+02   315.0000000  700.000000 2000.000000 9500.00000 30000.00000
## 8   2.079110e+02   306.5569284  926.338449 3371.241730 8197.24865 45224.18444
## 9   3.000000e+00     7.0000000   14.000000   22.000000   41.00000   122.00000
## 10 -2.374680e+04 -5794.3298640 -535.541627   81.834876 1416.99691 17493.04622
## 11  5.000000e-01     0.9308907    2.100000    5.833333   21.10753    83.17124
## 12  7.171505e-01     1.0573824    2.683495    9.025788   23.14330   118.10999
##           p_99    skewness    kurtosis        iqr
## 1      63.0000   0.5363262     2.39387   20.00000
## 2       3.0000   5.1122373    33.00333    0.00000
## 3       0.0000  13.7064537   267.37575    0.00000
## 4    1237.5200   4.1226523    71.01694  300.00000
## 5      60.0000  24.1323038   913.12756    2.00000
## 6       6.0000  -3.3403751    21.23888    0.00000
## 7   75000.0000  26.8529115  1891.10207 8800.00000
## 8  139024.9674  37.5514607  3943.49883 7270.91020
## 9     296.0000   6.5938554    68.33889   27.00000
## 10 106030.8947  36.3375982  5028.69734 1952.53854
## 11    275.0124 198.8139224 42998.78671   19.00753
## 12    448.2588 200.5649610 43911.99460   20.45981
##                                 range_98                          range_80
## 1                               [16, 63]                          [19, 52]
## 2                                 [0, 3]                            [0, 0]
## 3                                 [0, 0]                            [0, 0]
## 4                          [42, 1237.52]                    [200, 681.451]
## 5                               [10, 60]                        [34.5, 40]
## 6                                 [2, 6]                            [5, 5]
## 7                           [160, 75000]                      [500, 18500]
## 8         [207.910970483, 139024.967362]      [423.89078703, 24005.783949]
## 9                               [3, 296]                           [9, 78]
## 10       [-23746.8021087, 106030.894664]     [-3561.5234437, 4783.4900092]
## 11               [0.5, 275.012430939229]             [1.2671001480933, 50]
## 12 [0.717150487531148, 448.258842773946] [1.36353682761, 59.2646574748529]
skim(data_addCol)
Data summary
Name data_addCol
Number of rows 54000
Number of columns 26
_______________________
Column type frequency:
character 5
factor 7
numeric 12
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ClaimNumber 0 1 9 9 0 54000 0
Gender 0 1 1 1 0 3 0
MaritalStatus 29 1 1 1 0 3 0
PartTimeFullTime 0 1 1 1 0 2 0
ClaimDescription 0 1 3 94 0 28114 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
acc_yr 0 1 TRUE 18 199: 3034, 200: 3032, 199: 3031, 200: 3028
acc_qtr 0 1 TRUE 4 3: 13690, 2: 13545, 4: 13408, 1: 13357
acc_mth 0 1 TRUE 12 5: 4956, 3: 4748, 7: 4693, 11: 4677
report_yr 0 1 TRUE 19 200: 3078, 200: 3054, 200: 3052, 199: 3029
report_qtr 0 1 TRUE 4 2: 13906, 4: 13737, 3: 13596, 1: 12761
report_mth 0 1 TRUE 12 3: 4905, 5: 4822, 6: 4709, 10: 4654
acc_hr 0 1 TRUE 24 12: 6612, 11: 6186, 10: 6118, 9: 5265

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Age 0 1 33.84 12.12 13.00 23.00 32.00 43.00 81.0 ▇▇▅▂▁
DependentChildren 0 1 0.12 0.52 0.00 0.00 0.00 0.00 9.0 ▇▁▁▁▁
DependentsOther 0 1 0.01 0.11 0.00 0.00 0.00 0.00 5.0 ▇▁▁▁▁
WeeklyWages 0 1 416.36 248.64 1.00 200.00 392.20 500.00 7497.0 ▇▁▁▁▁
HoursWorkedPerWeek 0 1 37.74 12.57 0.00 38.00 38.00 40.00 640.0 ▇▁▁▁▁
DaysWorkedPerWeek 0 1 4.91 0.55 1.00 5.00 5.00 5.00 7.0 ▁▁▁▇▁
InitialIncurredClaimCost 0 1 7841.15 20584.08 1.00 700.00 2000.00 9500.00 2000000.0 ▇▁▁▁▁
UltimateIncurredClaimCost 0 1 11003.37 33390.99 121.89 926.34 3371.24 8197.25 4027135.9 ▇▁▁▁▁
day_diff 0 1 39.31 61.11 0.00 14.00 22.00 41.00 1095.0 ▇▁▁▁▁
init_ult_diff 0 1 3162.22 31554.54 -1912736.70 -535.54 81.83 1417.00 3984135.9 ▁▇▁▁▁
num_week_paid_init 0 1 60.18 3145.61 0.00 2.10 5.83 21.11 690000.0 ▇▁▁▁▁
num_week_paid_ult 0 1 61.42 2382.57 0.31 2.68 9.03 23.14 525713.3 ▇▁▁▁▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
DateTimeOfAccident 0 1 1988-01-01 09:00:00 2005-12-31 10:00:00 1997-01-07 10:00:00 36673
DateReported 0 1 1988-01-08 00:00:00 2006-09-23 00:00:00 1997-02-16 00:00:00 6653

Note that this data set only contains policies that claim from the company.

2.3 Outlier Checking

Over here, I have used Tukey outlier method to find the threshold for the values to be considered as outliers.

# Tukey's method
tukey_outlier(data_addCol$UltimateIncurredClaimCost)
## bottom_threshold    top_threshold 
##        -20886.39         30009.98
tukey_outlier(data_addCol$init_ult_diff)
## bottom_threshold    top_threshold 
##        -6393.157         7274.613
tukey_outlier(data_addCol$num_week_paid_init)
## bottom_threshold    top_threshold 
##        -54.92258         78.13011
tukey_outlier(data_addCol$num_week_paid_ult)
## bottom_threshold    top_threshold 
##        -58.69593         84.52272

After finding the thresholds for the outliers, remove the outliers from the data. Note that I am not using the prep_outlier function here as the function capped the outlier values at the max threshold, which doesn’t look correct.

data_1 <- data_addCol %>%
  # remove the hours per week have more than 168 hours
  filter(HoursWorkedPerWeek < 168) %>%
  # remove the outliers from the dataset
  filter(init_ult_diff > -6500,
         init_ult_diff < 7300,
         num_week_paid_init < 80,
         num_week_paid_ult < 85)

Re-check the dataset after remove the outliers from the dataset

skim(data_1)
Data summary
Name data_1
Number of rows 46513
Number of columns 26
_______________________
Column type frequency:
character 5
factor 7
numeric 12
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ClaimNumber 0 1 9 9 0 46513 0
Gender 0 1 1 1 0 3 0
MaritalStatus 27 1 1 1 0 3 0
PartTimeFullTime 0 1 1 1 0 2 0
ClaimDescription 0 1 3 94 0 24693 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
acc_yr 0 1 TRUE 18 198: 2814, 199: 2789, 199: 2771, 199: 2764
acc_qtr 0 1 TRUE 4 3: 11795, 2: 11641, 1: 11605, 4: 11472
acc_mth 0 1 TRUE 12 5: 4251, 3: 4096, 7: 4047, 8: 4009
report_yr 0 1 TRUE 19 199: 2809, 199: 2774, 199: 2768, 198: 2752
report_qtr 0 1 TRUE 4 2: 11998, 4: 11733, 3: 11719, 1: 11063
report_mth 0 1 TRUE 12 3: 4259, 5: 4170, 6: 4057, 9: 4013
acc_hr 0 1 TRUE 24 12: 5516, 11: 5432, 10: 5287, 9: 4534

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Age 0 1 33.32 11.97 13.00 23.00 31.00 42.00 79.00 ▇▇▅▂▁
DependentChildren 0 1 0.11 0.51 0.00 0.00 0.00 0.00 6.00 ▇▁▁▁▁
DependentsOther 0 1 0.01 0.10 0.00 0.00 0.00 0.00 3.00 ▇▁▁▁▁
WeeklyWages 0 1 407.29 232.26 6.81 200.00 384.00 500.00 7497.00 ▇▁▁▁▁
HoursWorkedPerWeek 0 1 37.60 6.52 0.00 38.00 38.00 40.00 100.00 ▁▇▁▁▁
DaysWorkedPerWeek 0 1 4.92 0.51 1.00 5.00 5.00 5.00 7.00 ▁▁▁▇▁
InitialIncurredClaimCost 0 1 4198.39 5781.93 1.00 550.00 1500.00 7000.00 86000.00 ▇▁▁▁▁
UltimateIncurredClaimCost 0 1 4345.20 5502.90 121.89 776.30 2354.95 6027.01 80192.84 ▇▁▁▁▁
day_diff 0 1 37.99 56.15 0.00 14.00 22.00 40.00 1095.00 ▇▁▁▁▁
init_ult_diff 0 1 146.81 2079.26 -6496.65 -427.16 52.10 1007.51 7297.69 ▁▂▇▂▁
num_week_paid_init 0 1 11.16 14.33 0.00 2.00 4.07 15.57 79.87 ▇▂▁▁▁
num_week_paid_ult 0 1 11.16 12.28 0.31 2.26 6.48 15.87 84.58 ▇▂▁▁▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
DateTimeOfAccident 0 1 1988-01-01 09:00:00 2005-12-31 10:00:00 1996-06-15 11:00:00 32910
DateReported 0 1 1988-01-08 00:00:00 2006-07-25 00:00:00 1996-07-23 00:00:00 6619

2.4 Feature Engineering

Text Mining

tidy_clm_unigram <- data_1 %>%
  unnest_tokens(word, ClaimDescription, token = "ngrams", n = 1) %>%
  anti_join(get_stopwords())
## Joining, by = "word"
cleaned_clm_unigram <- tidy_clm_unigram %>%
  count(word, sort = TRUE) %>%
  mutate(cum_count = cumsum(n),
         cum_perc = cum_count/sum(n))
cleaned_clm_unigram %>%
  filter(n > 300) %>%
  ggplot(aes(label = word, size = n, color = n)) +
  geom_text_wordcloud() +
  scale_size_area(max_size = 20) +
  theme_minimal()

Create indicator for various injury types

body_part_list <- c("back", "finger", "hand", "shoulder", "eye", "knee", "wrist", "thumb", "neck", "ankle", "arm", "foot", "leg", "forearm", "elbow", "head")

tidy_clm_unigram_1 <- tidy_clm_unigram

for (i in body_part_list){
  new_col <- paste0("body_", i)
  
  tidy_clm_unigram_1 <- tidy_clm_unigram_1 %>%
    mutate(!!sym(new_col) := case_when(word == i ~ 1,
                                       TRUE ~ 0))
}


side_list <- c("left", "right", "upper")

for (i in side_list){
  new_col <- paste0("side_", i)
  
  tidy_clm_unigram_1 <- tidy_clm_unigram_1 %>%
    mutate(!!sym(new_col) := case_when(word == i ~ 1,
                                       TRUE ~ 0))
}


item_list <- c("knife", "metal", "ladder", "door", "floor", "stairs", "machine", "box", "vehicle")

for (i in item_list){
  new_col <- paste0("item_", i)
  
  tidy_clm_unigram_1 <- tidy_clm_unigram_1 %>%
    mutate(!!sym(new_col) := case_when(word == i ~ 1,
                                       TRUE ~ 0))
}

tidy_clm_unigram_1 <- tidy_clm_unigram_1 %>%
  mutate(item_vehicle = case_when(word == c("vehicle", "motor", "truck", "car") ~ 1,
                                  TRUE ~ 0))



injury_cause_list <- c("lifting", "struck", "accident", "slip", "grind", "trip", "fall", "hit", "repetitive")

for (i in injury_cause_list){
  new_col <- paste0("injury_cause_", i)
  
  tidy_clm_unigram_1 <- tidy_clm_unigram_1 %>%
    mutate(!!sym(new_col) := case_when(word == i ~ 1,
                                       TRUE ~ 0))
}


tidy_clm_unigram_1 <- tidy_clm_unigram_1 %>%
  mutate(injury_cause_slip = case_when(word == c("slipped", "slip", "slipping") ~ 1,
                                       TRUE ~ 0),
         injury_cause_grind = case_when(word == c("grinding", "grinder") ~ 1,
                                       TRUE ~ 0),
         injury_cause_trip = case_when(word == c("tripped", "trip") ~ 1,
                                      TRUE ~ 0),
         injury_cause_fall = case_when(word == c("fell", "fall", "falling", "fallen") ~ 1,
                                      TRUE ~ 0),
         injury_cause_hit = case_when(word == c("hitting", "hit") ~ 1,
                                     TRUE ~ 0))
## Warning: Problem with `mutate()` input `injury_cause_slip`.
## i longer object length is not a multiple of shorter object length
## i Input `injury_cause_slip` is `case_when(...)`.
## Warning in word == c("slipped", "slip", "slipping"): longer object length is not
## a multiple of shorter object length
tidy_clm_unigram_1 <- tidy_clm_unigram_1 %>%
  mutate(injury_type_strain = case_when(word == c("strain", "strained") ~ 1,
                                        TRUE ~ 0),
         injury_type_sprain = case_when(word == c("sprain", "sprained") ~ 1,
                                        TRUE ~ 0),
         injury_type_bruise = case_when(word == c("bruised", "bruising", "bruise", "bruises") ~ 1,
                                        TRUE ~ 0),
         injury_type_twist = case_when(word == c("twisted", "twisting", "twist") ~ 1,
                                       TRUE ~ 0),
         injury_type_cut = case_when(word == c("cut", "cutting", "cutter", "cuts") ~ 1,
                                     TRUE ~ 0),
         injury_type_fracture = case_when(word == c("fracture", "fractured") ~ 1,
                                          TRUE ~ 0),
         injury_type_burn = case_when(word == c("burn", "burned", "burns") ~ 1,
                                      TRUE ~ 0))
## Warning: Problem with `mutate()` input `injury_type_twist`.
## i longer object length is not a multiple of shorter object length
## i Input `injury_type_twist` is `case_when(...)`.
## Warning in word == c("twisted", "twisting", "twist"): longer object length is
## not a multiple of shorter object length
## Warning: Problem with `mutate()` input `injury_type_burn`.
## i longer object length is not a multiple of shorter object length
## i Input `injury_type_burn` is `case_when(word == c("burn", "burned", "burns") ~ 1, TRUE ~ 0)`.
## Warning in word == c("burn", "burned", "burns"): longer object length is not a
## multiple of shorter object length

Check whether there is any repeated claim number

check_clm <- data_1 %>%
  group_by(ClaimNumber) %>%
  summarise(count = n())
## `summarise()` ungrouping output (override with `.groups` argument)
data_2 <- data_1

# Join back body part injury
for (i in body_part_list){
  new_col <- paste0("body_", i)
  
  temp <- tidy_clm_unigram_1 %>%
    group_by(ClaimNumber) %>%
    summarise(!!sym(new_col) := case_when(sum(get(paste0("body_", i))) > 0 ~ 1,
                                       TRUE ~ 0))
  data_2 <- data_2 %>%
    left_join(temp, by = c("ClaimNumber"))
  
  rm(temp)
}
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Join back body side part injury
for (i in side_list){
  new_col <- paste0("side_", i)
  
  temp <- tidy_clm_unigram_1 %>%
    group_by(ClaimNumber) %>%
    summarise(!!sym(new_col) := case_when(sum(get(paste0("side_", i))) > 0 ~ 1,
                                       TRUE ~ 0))
  data_2 <- data_2 %>%
    left_join(temp, by = c("ClaimNumber"))
  
  rm(temp)
}
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Join back item list
for (i in item_list){
  new_col <- paste0("item_", i)
  
  temp <- tidy_clm_unigram_1 %>%
    group_by(ClaimNumber) %>%
    summarise(!!sym(new_col) := case_when(sum(get(paste0("item_", i))) > 0 ~ 1,
                                       TRUE ~ 0))
  data_2 <- data_2 %>%
    left_join(temp, by = c("ClaimNumber"))
  
  rm(temp)
}
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Join back injury case
for (i in injury_cause_list){
  new_col <- paste0("injury_cause_", i)
  
  temp <- tidy_clm_unigram_1 %>%
    group_by(ClaimNumber) %>%
    summarise(!!sym(new_col) := case_when(sum(get(paste0("injury_cause_", i))) > 0 ~ 1,
                                       TRUE ~ 0))
  data_2 <- data_2 %>%
    left_join(temp, by = c("ClaimNumber"))
  
  rm(temp)
}
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Join back injury type
injury_type_list <- c("strain", "sprain", "bruise", "twist", "cut", "fracture", "burn")

# Join back injury type
for (i in injury_type_list){
  new_col <- paste0("injury_type_", i)
  
  temp <- tidy_clm_unigram_1 %>%
    group_by(ClaimNumber) %>%
    summarise(!!sym(new_col) := case_when(sum(get(paste0("injury_type_", i))) > 0 ~ 1,
                                       TRUE ~ 0))
  data_2 <- data_2 %>%
    left_join(temp, by = c("ClaimNumber"))
  
  rm(temp)
}
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
beep("treasure")
# use gather function to covert the one-hot coding into the necessary columns
# note that the conversion splits into two steps
# the reason to split into two steps is because if this is done in one steps, there will be duplicated records
# the code below first gather single injury types
data_3_singleInjury <- data_2 %>%
  mutate(sum_injury = rowSums(across(contains("body_")))) %>%
  gather(injury_body, flag, body_back:body_head) %>%
  filter(flag == 1 & sum_injury == 1) %>%
  dplyr::select(-flag) %>%
  mutate(injury_body = str_replace(injury_body, "body_", ""))

# the code below gathers the remaining types
data_3_multipleInjury <- data_2 %>%
  mutate(sum_injury = rowSums(across(contains("body_")))) %>%
  filter(sum_injury != 1) %>%
  mutate(injury_body = case_when(sum_injury > 1 ~ "multiple",
                                 TRUE ~ "others")) %>%
  dplyr::select(-contains("body_"))

# bind the rows
data_3 <- bind_rows(data_3_singleInjury, data_3_multipleInjury)
rm(data_3_singleInjury, data_3_multipleInjury)


# replicated for other listing

# side list
data_3_singleside <- data_3 %>%
  mutate(sum_side = rowSums(across(contains("side_")))) %>%
  gather(injury_side, flag, side_left:side_upper) %>%
  filter(flag == 1 & sum_side == 1) %>%
  dplyr::select(-flag) %>%
  mutate(injury_side = str_replace(injury_side, "side_", ""))

# the code below gathers the remaining types
data_3_multipleside <- data_3 %>%
  mutate(sum_side = rowSums(across(contains("side_")))) %>%
  filter(sum_side != 1) %>%
  mutate(injury_side = case_when(sum_side > 1 ~ "multiple",
                                 TRUE ~ "others")) %>%
  dplyr::select(-contains("side_"))

# bind the rows
data_3 <- bind_rows(data_3_singleside, data_3_multipleside)
rm(data_3_singleside, data_3_multipleside)



# item list
data_3_singleitem <- data_3 %>%
  mutate(sum_item = rowSums(across(contains("item_")))) %>%
  gather(injury_item, flag, item_knife:item_vehicle) %>%
  filter(flag == 1 & sum_item == 1) %>%
  dplyr::select(-flag) %>%
  mutate(injury_item = str_replace(injury_item, "item_", ""))

# the code below gathers the remaining types
data_3_multipleitem <- data_3 %>%
  mutate(sum_item = rowSums(across(contains("item_")))) %>%
  filter(sum_item != 1) %>%
  mutate(injury_item = case_when(sum_item > 1 ~ "multiple",
                                 TRUE ~ "others")) %>%
  dplyr::select(-contains("item_"))

# bind the rows
data_3 <- bind_rows(data_3_singleitem, data_3_multipleitem)
rm(data_3_singleitem, data_3_multipleitem)


# injury cause
data_3_singleinjurycause <- data_3 %>%
  mutate(sum_injurycause = rowSums(across(contains("injury_cause_")))) %>%
  gather(injury_cause, flag, injury_cause_lifting:injury_cause_repetitive) %>%
  filter(flag == 1 & sum_injurycause == 1) %>%
  dplyr::select(-flag) %>%
  mutate(injury_cause = str_replace(injury_cause, "injury_cause_", ""))

# the code below gathers the remaining types
data_3_multipleinjurycause <- data_3 %>%
  mutate(sum_injurycause = rowSums(across(contains("injury_cause_")))) %>%
  filter(sum_injurycause != 1) %>%
  mutate(injury_cause = case_when(sum_injurycause > 1 ~ "multiple",
                                 TRUE ~ "others")) %>%
  dplyr::select(-contains("injury_cause_"))

# bind the rows
data_3 <- bind_rows(data_3_singleinjurycause, data_3_multipleinjurycause)
rm(data_3_singleinjurycause, data_3_multipleinjurycause)


# injury type
data_3_singleinjurytype <- data_3 %>%
  mutate(sum_injurytype = rowSums(across(contains("injury_type_")))) %>%
  gather(injury_type, flag, injury_type_strain:injury_type_burn) %>%
  filter(flag == 1 & sum_injurytype == 1) %>%
  dplyr::select(-flag) %>%
  mutate(injury_type = str_replace(injury_type, "injury_type_", ""))

# the code below gathers the remaining types
data_3_multipleinjurytype <- data_3 %>%
  mutate(sum_injurytype = rowSums(across(contains("injury_type_")))) %>%
  filter(sum_injurytype != 1) %>%
  mutate(injury_type = case_when(sum_injurytype > 1 ~ "multiple",
                                 TRUE ~ "others")) %>%
  dplyr::select(-contains("injury_type_"))

# bind the rows
data_3 <- bind_rows(data_3_singleinjurytype, data_3_multipleinjurytype) %>%
  dplyr::select(-c(sum_injury,
                   sum_item,
                   sum_injurycause,
                   sum_injurytype,
                   sum_side))

rm(data_3_singleinjurytype, data_3_multipleinjurytype)

2.5 Feature Selection

2.5.1 Frequency Count Plot

2.5.1.1 Target Variable

ggplot(data_3, aes(init_ult_diff)) +
  geom_histogram(stat = "bin")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

2.5.1.2 Plot out for all numeric variables

num_var_list <- data_3 %>%
  dplyr::select(where(is.numeric),
                -init_ult_diff) %>%
  names()
for (i in num_var_list){
  print(ggplot(data_3, aes(!!sym(i))) +
          geom_histogram(stat = "bin") +
          labs(title = sym(i)) +
          theme_minimal())
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot the continuous variables against target variable (ie. init_ult_diff)

for (i in num_var_list){
  print(ggplot(data_3, aes(x = !!sym(i), y = init_ult_diff)) +
          geom_point(alpha = 0.1) +
          labs(title = sym(i)) +
          theme_minimal())
}

2.5.1.2 Plot out for all categorical variables

cat_var_list <- data_3 %>%
  dplyr::select(!where(is.numeric),
                -c(ClaimNumber, DateTimeOfAccident, DateReported, ClaimDescription)) %>%
  names()
for (i in cat_var_list){
  print(ggplot(data_3, aes(!!sym(i))) +
          geom_histogram(stat = "count") +
          labs(title = sym(i)) +
          theme_minimal())
}
## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

Plot the categorical variables against target variable (ie. init_ult_diff)

for (i in cat_var_list){
  print(ggplot(data_3, aes(x = !!sym(i), y = init_ult_diff)) +
          geom_boxplot() +
          labs(title = sym(i)) +
          theme_minimal())
}

2.5.2 Relationship between continuous target variable and categorical input variables

summary(aov(init_ult_diff ~ acc_mth, data = data_3))
##                Df    Sum Sq  Mean Sq F value   Pr(>F)    
## acc_mth        11 2.831e+08 25739962   5.961 8.57e-10 ***
## Residuals   46501 2.008e+11  4318265                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat_var_list_2 <- data_3 %>%
  dplyr::select(!where(is.numeric),
                -c(ClaimNumber, DateTimeOfAccident, DateReported, ClaimDescription, acc_yr, report_yr)) %>%
  names()



for (i in cat_var_list_2){
  assign(paste0("graph_clmdiff_", i),
         ggbetweenstats(data_3,
                        x = !!sym(i),
                        y = init_ult_diff,
                        pairwise.comparisons = TRUE,
                        title = paste0("ANOVA Test on ", i," vs init_ult_diff"),
                        ggtheme = ggplot2::theme(axis.text.x = element_text(angle = 90)),
                        package = "RColorBrewer",
                        palette = "Set3"))
  
  print(get(paste0("graph_clmdiff_", i)))
}

## Warning: Number of labels is greater than default palette color count.
##  Try using another color `palette` (and/or `package`).
## 

## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Number of labels is greater than default palette color count.
##  Try using another color `palette` (and/or `package`).
## 

2.5.3 Relationship between continuous target variable and continuous input variables

Correlation

data_1_num <- data_1 %>%
  dplyr::select(where(is.numeric))
corrplot(cor(data_1_num, use="pairwise.complete.obs"), 
         method = "number", 
         type = "upper", 
         tl.cex = 0.65, 
         number.cex = 0.65, 
         diag = FALSE)

The correlation between numeric variables and outcome is rather weak.

write_csv(data_3, "data/data_eda_actLoss_3.csv")

beep("mario")
#save.image(file = "data/MITBCapstone_Jasper_actLoss.RData")
#load(file = "data/MITBCapstone_Jasper_actLoss.RData")